In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from dask.diagnostics import ProgressBar
from datetime import timedelta
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots


import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa import stattools
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from dateutil.parser import parse
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
In [2]:
import plotly.offline as po
po.init_notebook_mode()
In [3]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

1. Data Exploration and Preprocessing¶

1.a¶

In [ ]:
# !wget -q --no-check-certificate '' -O "data.zip"
!unzip -q "/content/drive/MyDrive/Colab Notebooks/nyc_taxi_data.zip" -d "/content/drive/MyDrive/Colab Notebooks"
In [ ]:
# df = dd.read_parquet('/content/nyc_taxi_alt', engine='pyarrow')
df = dd.read_parquet('/content/drive/MyDrive/Colab Notebooks/nyc_taxi_alt', engine='pyarrow')
In [ ]:
df.columns
Out[ ]:
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')
In [ ]:
df.shape[0].compute(), df.shape[1]
Out[ ]:
(179311700, 19)
In [ ]:
df.dtypes
In [ ]:
with ProgressBar():
    missing_values = df.isnull().sum().compute()
print("missing values:\n")
missing_values
[########################################] | 100% Completed | 115.73 s
missing values:

Out[ ]:
VendorID                         0
tpep_pickup_datetime             0
tpep_dropoff_datetime            0
passenger_count            3872186
trip_distance                    0
RatecodeID                 3872186
store_and_fwd_flag         3872186
PULocationID                     0
DOLocationID                     0
payment_type                     0
fare_amount                      0
extra                            0
mta_tax                          0
tip_amount                       0
tolls_amount                     0
improvement_surcharge            0
total_amount                     0
congestion_surcharge       8660504
airport_fee              115760994
dtype: int64
In [ ]:
columns_to_drop_null = ['passenger_count', 'RatecodeID', 'store_and_fwd_flag', 'congestion_surcharge']
In [ ]:
def fill_nan(partition):
    """Replace every NaN in the partition with 0 and return the result."""
    filled = partition.fillna(0)
    return filled

def drop_column(partition, column):
    """Return the partition without the given column (label or list of labels)."""
    trimmed = partition.drop(column, axis=1)
    return trimmed

def drop_null_values(partition):
    """Keep only rows with no NaN in the module-level ``columns_to_drop_null`` columns."""
    complete_rows = partition.dropna(subset=columns_to_drop_null)
    return complete_rows

# Remove rows with missing values in the key columns, then drop the
# airport_fee / mta_tax / improvement_surcharge columns entirely
# (airport_fee is mostly null per the missing-value counts above; the
# other two are near-constant surcharges).
filtered_df = df.map_partitions(drop_null_values)
filtered_df = filtered_df.map_partitions(drop_column, column='airport_fee')
filtered_df = filtered_df.map_partitions(drop_column, column='mta_tax')
filtered_df = filtered_df.map_partitions(drop_column, column='improvement_surcharge')

لأن العمود airport_fee معظم قيمه Null، والأعمدة mta_tax وimprovement_surcharge معظم قيمها ثابتة.

الأعمدة التالية passenger_count، RatecodeID، store_and_fwd_flag، congestion_surcharge تحتوي قيم فارغة لذا حذفنا الأسطر الفارغة على أساسها.
In [ ]:
output_dir = '/content/drive/MyDrive/Colab Notebooks/filtered_data.parquet'
In [ ]:
with ProgressBar():
    filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 211.98 s
In [ ]:
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
In [ ]:
def preprocess_date(partition):
    """Flag rows whose pickup/dropoff timestamps are plausible.

    Adds a boolean ``pickup_before_dropoff`` column that is True only when
    both timestamps fall within 2019-2022 (inclusive) and the pickup
    strictly precedes the dropoff.
    """
    pickup = partition['tpep_pickup_datetime']
    dropoff = partition['tpep_dropoff_datetime']
    pickup_in_range = pickup.dt.year.between(2019, 2022)
    dropoff_in_range = dropoff.dt.year.between(2019, 2022)
    valid = pickup_in_range & dropoff_in_range & (pickup < dropoff)
    return partition.assign(pickup_before_dropoff=valid)
  • حذف الرحلات المسجلة بتاريخ tpep_pickup_datetime أو tpep_dropoff_datetime قبل 2019 أو بعد 2022.
  • حذف الرحلات التي تاريخ انطلاقها tpep_pickup_datetime يسبق تاريخ انتهاءها tpep_dropoff_datetime.
In [ ]:
filtered_df['tpep_pickup_datetime'] = dd.to_datetime(filtered_df['tpep_pickup_datetime'])
filtered_df['tpep_dropoff_datetime'] = dd.to_datetime(filtered_df['tpep_dropoff_datetime'])
filtered_df = filtered_df.map_partitions(preprocess_date)
In [ ]:
def drop_rows(partition):
    """Keep only the rows flagged valid by the ``pickup_before_dropoff`` column."""
    mask = partition['pickup_before_dropoff']
    return partition.loc[mask]
In [ ]:
filtered_df = filtered_df.map_partitions(drop_rows)
In [ ]:
with ProgressBar():
    filtered_df.to_parquet(output_dir, engine='pyarrow')
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 222.07 s
In [ ]:
# Count rows holding negative values. Column set kept consistent with the
# filter applied by drop_negative_rows below (passenger_count and
# trip_distance were previously missing from this diagnostic, so the
# reported counts understated the rows actually removed).
with ProgressBar():
    negative_counts = filtered_df[
    (filtered_df['extra'] < 0) |
    (filtered_df['passenger_count'] < 0) |
    (filtered_df['trip_distance'] < 0) |
    (filtered_df['tip_amount'] < 0) | (filtered_df['tolls_amount'] < 0) |
    (filtered_df['total_amount'] < 0) |
    (filtered_df['congestion_surcharge'] < 0)
    ].count().compute()
negative_counts
In [ ]:
def drop_negative_rows(partition):
    """Drop rows where any quantity/amount column holds a negative value."""
    non_negative_cols = ['extra', 'passenger_count', 'trip_distance',
                         'tip_amount', 'tolls_amount', 'total_amount',
                         'congestion_surcharge']
    keep = (partition[non_negative_cols] >= 0).all(axis=1)
    return partition[keep]

filtered_df = filtered_df.map_partitions(drop_negative_rows)
حذف الأسطر التي تحوي قيماً سالبة في أحد الأعمدة التالية: extra، passenger_count، trip_distance، tip_amount، tolls_amount، congestion_surcharge، total_amount لأن ليس منطقياً أن تحوي قيماً سالبة (كمية، مسافة، وقت، ...).
In [ ]:
with ProgressBar():
    filtered_df = filtered_df.repartition(partition_size="64MB")
    filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 71.11 s
[########################################] | 100% Completed | 167.56 s
In [ ]:
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
In [ ]:
with ProgressBar():
    stats = filtered_df.describe().compute()
stats
[########################################] | 100% Completed | 153.57 s
Out[ ]:
VendorID passenger_count trip_distance RatecodeID PULocationID DOLocationID payment_type fare_amount extra tip_amount tolls_amount total_amount congestion_surcharge
count 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08 1.742050e+08
mean 1.669321e+00 1.488854e+00 3.109136e+00 1.151356e+00 1.641665e+02 1.620703e+02 1.259357e+00 1.354792e+01 1.097812e+00 2.363918e+00 4.014292e-01 1.976463e+01 2.258285e+00
std 4.704577e-01 1.113451e+00 3.643040e+01 3.020670e+00 6.568577e+01 7.027044e+01 4.608893e-01 1.730048e+02 3.790361e+01 1.111647e+01 1.803824e+00 1.933940e+02 7.401891e-01
min 1.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 -6.575000e+02 -7.500000e+00 -1.100000e+02 -4.950000e+01 -6.625000e+02 -2.500000e+00
25% 2.000000e+00 1.000000e+00 1.210000e+00 1.000000e+00 1.370000e+02 1.320000e+02 1.000000e+00 8.500000e+00 0.000000e+00 1.000000e+00 0.000000e+00 1.280000e+01 2.500000e+00
50% 2.000000e+00 1.000000e+00 2.090000e+00 1.000000e+00 1.630000e+02 1.630000e+02 1.000000e+00 1.200000e+01 1.000000e+00 2.360000e+00 0.000000e+00 1.716000e+01 2.500000e+00
75% 2.000000e+00 2.000000e+00 4.200000e+00 1.000000e+00 2.360000e+02 2.360000e+02 2.000000e+00 2.400000e+01 2.500000e+00 4.550000e+00 0.000000e+00 3.324000e+01 2.500000e+00
max 2.000000e+00 1.120000e+02 1.843408e+05 9.900000e+01 2.650000e+02 2.650000e+02 5.000000e+00 9.983100e+05 5.000008e+05 1.414920e+05 9.565500e+02 1.084772e+06 4.500000e+00

outlier¶

In [ ]:
# Define the function for outlier detection
def detect_outliers(partition, column_name, z_score_threshold):
  mean = stats.loc['mean', column_name]
  std = stats.loc['std', column_name]

  partition['z_score'] = (partition[column_name] - mean) / std

  outliers = partition[abs(partition['z_score']) > z_score_threshold]

  return outliers
In [ ]:
# Define the function for outlier detection
def delete_outliers(partition, column_name, z_score_threshold):
  mean = stats.loc['mean', column_name]
  std = stats.loc['std', column_name]

  z = (partition[column_name] - mean) / std

  outliers = partition[abs(z) <= z_score_threshold]

  return outliers
trip_distance¶
In [ ]:
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='trip_distance', z_score_threshold=2)
with ProgressBar():
    filtered_df2 = filtered_df2.repartition(partition_size="64MB")
[########################################] | 100% Completed | 77.52 s
In [ ]:
with ProgressBar():
  z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
[########################################] | 100% Completed | 74.74 s
In [ ]:
z_score.reset_index()
In [ ]:
filtered_df = filtered_df.map_partitions(delete_outliers, column_name='trip_distance', z_score_threshold=6)
with ProgressBar():
    filtered_df = filtered_df.repartition(partition_size="64MB")
[########################################] | 100% Completed | 81.24 s
In [ ]:
with ProgressBar():
    filtered_df.to_parquet(output_dir, engine='pyarrow')
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 219.96 s
passenger_count¶
In [ ]:
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='passenger_count', z_score_threshold=1)
In [ ]:
with ProgressBar():
    filtered_df2 = filtered_df2.repartition(partition_size="64MB")
In [ ]:
with ProgressBar():
  z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
In [ ]:
z_score.reset_index()
In [ ]:
filtered_df = filtered_df.map_partitions(delete_outliers, column_name='passenger_count', z_score_threshold=10)
with ProgressBar():
    filtered_df = filtered_df.repartition(partition_size="64MB")
[########################################] | 100% Completed | 77.49 s
In [ ]:
with ProgressBar():
    filtered_df.to_parquet(output_dir, engine='pyarrow')
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 189.43 s
fare_amount¶
In [ ]:
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='fare_amount', z_score_threshold=1)
In [ ]:
with ProgressBar():
    filtered_df2 = filtered_df2.repartition(partition_size="64MB")
In [ ]:
with ProgressBar():
  z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
In [ ]:
z_score.reset_index()
In [ ]:
filtered_df = filtered_df.map_partitions(delete_outliers, column_name='fare_amount', z_score_threshold=100)
with ProgressBar():
    filtered_df = filtered_df.repartition(partition_size="64MB")
[########################################] | 100% Completed | 88.65 s
In [ ]:
with ProgressBar():
    filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 200.32 s
In [ ]:
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
extra¶
In [ ]:
filtered_df2 = filtered_df2.map_partitions(detect_outliers, column_name='extra', z_score_threshold=1)
In [ ]:
with ProgressBar():
    filtered_df2 = filtered_df2.repartition(partition_size="64MB")
In [ ]:
with ProgressBar():
  z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
In [ ]:
filtered_df = filtered_df.map_partitions(delete_outliers, column_name='extra', z_score_threshold=100)
In [ ]:
with ProgressBar():
    filtered_df = filtered_df.repartition(partition_size="64MB")
    filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 104.70 s
[########################################] | 100% Completed | 214.02 s
In [ ]:
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
tip_amount, tolls_amount, total_amount, congestion_surcharge¶
In [ ]:
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='tip_amount', z_score_threshold=1)
In [ ]:
with ProgressBar():
    filtered_df2 = filtered_df2.repartition(partition_size="64MB")
In [ ]:
with ProgressBar():
  z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
In [ ]:
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='tolls_amount', z_score_threshold=1)
In [ ]:
with ProgressBar():
    filtered_df2 = filtered_df2.repartition(partition_size="64MB")
In [ ]:
with ProgressBar():
  z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
In [ ]:
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='total_amount', z_score_threshold=1)
In [ ]:
with ProgressBar():
    filtered_df2 = filtered_df2.repartition(partition_size="64MB")
In [ ]:
with ProgressBar():
  z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
In [ ]:
filtered_df2 = filtered_df.map_partitions(detect_outliers, column_name='congestion_surcharge', z_score_threshold=1)
In [ ]:
with ProgressBar():
    filtered_df2 = filtered_df2.repartition(partition_size="64MB")
In [ ]:
with ProgressBar():
  z_score = filtered_df2['z_score'].value_counts(dropna=False).compute()
z_score.reset_index()
In [ ]:
with ProgressBar():
    filtered_df.to_parquet(output_dir, engine='pyarrow')
filtered_df = dd.read_parquet(output_dir, engine='pyarrow')
حذف القيم المتطرفة

تم اكتشاف القيم المتطرفة باستخدام معيار ال z-score الذي يمثل نسبة انحراف القيمة عن الانحراف المعياري. حيث وجدنا انحرافاً في بعض القيم في الأعمدة التالية:

  • trip_distance

تم حذف القيم التي انحرافها أكبر من 6 لأنها حالات نادرة وربما تؤثر قيمها في الدراسة التحليلية.

  • passenger_count

تم حذف قيمتين بانحراف أكبر من 10 حيث انحرافهما (84 و98) (وهو انحراف كبير بالنسبة لعدد ركاب تكسي أجرة🙃).

  • fare_amount

تم حذف القيم التي انحرافها أكبر من 100 نظراً لأن معظم القيم انحرافها متقارب وأقل من هذه القيمة عدا بعض القيم الشاذة.

  • Extra

تم حذف القيم التي انحرافاتها أكبر من 100 (اعتبرنا الحد 100 قيمة منطقية مقارنة بباقي القيم ولأن القيمة نقدية)، وهي قيمة وحيدة تحقق الحالة (13191.35).

حذف القيم الخارجة عن التصنيف المحدد
¶

In [ ]:
with ProgressBar():
    rate_code_counts = filtered_df['RatecodeID'].value_counts(dropna=False).compute()
    store_fwd_flag_counts = filtered_df['store_and_fwd_flag'].value_counts(dropna=False).compute()
    payment_type_counts = filtered_df['payment_type'].value_counts(dropna=False).compute()
[########################################] | 100% Completed | 15.39 s
[########################################] | 100% Completed | 30.57 s
[########################################] | 100% Completed | 16.67 s
In [ ]:
expected_rate_codes = [1, 2, 3, 4, 5, 6]
unexpected_rate_codes = rate_code_counts[~rate_code_counts.index.isin(expected_rate_codes)]
unexpected_rate_codes
Out[ ]:
99.0    163346
Name: RatecodeID, dtype: int64
In [ ]:
expected_store_fwd_flags = ['Y', 'N']
unexpected_store_fwd_flags = store_fwd_flag_counts[~store_fwd_flag_counts.index.isin(expected_store_fwd_flags)]
In [ ]:
expected_payment_types = [1, 2, 3, 4, 5, 6]
unexpected_payment_types = payment_type_counts[~payment_type_counts.index.isin(expected_payment_types)]
In [ ]:
# Keep only rows whose categorical codes fall inside the documented domains.
# The ``unexpected_*`` series were computed above from value_counts against
# the expected code lists (RatecodeID 1-6, store_and_fwd_flag Y/N,
# payment_type 1-6).
filtered_df = filtered_df.map_partitions(
    lambda partition: partition[
        ~partition['RatecodeID'].isin(unexpected_rate_codes.index) &
        ~partition['store_and_fwd_flag'].isin(unexpected_store_fwd_flags.index) &
        ~partition['payment_type'].isin(unexpected_payment_types.index)
    ]
)
In [ ]:
filtered_df = filtered_df.map_partitions(lambda partition: partition.drop_duplicates())
In [ ]:
with ProgressBar():
    filtered_df = filtered_df.repartition(partition_size="64MB")
    filtered_df.to_parquet(output_dir, engine='pyarrow')
[########################################] | 100% Completed | 100.92 s
[########################################] | 100% Completed | 187.63 s
  • حذف قيم العمود RatecodeID، عدا القيم المحددة للتصنيف ([1, 2, 3, 4, 5, 6]).

  • حذف قيم العمود store_and_fwd_flag، عدا القيمتين (['Y', 'N'])

  • حذف قيم العمود payment_type، عدا القيم المحددة للتصنيف ([1, 2, 3, 4, 5, 6])

Cleaned Data¶

In [ ]:
cleaned_data = dd.read_parquet(output_dir, engine='pyarrow')
In [ ]:
with ProgressBar():
    shape = cleaned_data.shape[0].compute()
shape
[########################################] | 100% Completed | 61.72 s
Out[ ]:
174041065
In [ ]:
cleaned_data.columns

Time Series¶

In [ ]:
with ProgressBar():
  # Derive calendar components from the pickup timestamp, then aggregate
  # revenue and trip counts per (year, month, day, hour, vendor).
  cleaned_data['pickup_hour'] = cleaned_data['tpep_pickup_datetime'].dt.hour
  cleaned_data['pickup_day'] = cleaned_data['tpep_pickup_datetime'].dt.day
  cleaned_data['pickup_month'] = cleaned_data['tpep_pickup_datetime'].dt.month
  cleaned_data['pickup_year'] = cleaned_data['tpep_pickup_datetime'].dt.year
  # 'VendorID': 'count' yields the number of trips per group (renamed to
  # trip_count in the next cell); 'total_amount': 'sum' is the revenue.
  result_df = cleaned_data.groupby(['pickup_year', 'pickup_month', 'pickup_day', 'pickup_hour', 'VendorID']).agg({'total_amount': 'sum', 'VendorID': 'count'}).compute()
In [ ]:
result_df = result_df.rename(columns={'VendorID': 'trip_count', 'total_amount': 'total_revenue'}).reset_index()
In [ ]:
result_df.pickup_year.unique()
Out[ ]:
array([2019, 2020, 2021, 2022])
In [ ]:
result_df['datetime'] = pd.to_datetime(result_df[['pickup_year', 'pickup_month', 'pickup_day', 'pickup_hour']].astype(str).agg('-'.join, axis=1) + ':00:00')
In [ ]:
result_df.set_index('datetime', inplace=True)
In [ ]:
df_resampled = result_df.groupby('VendorID').resample('H').agg(
    {'total_revenue': 'sum', 'trip_count': 'sum'}).reset_index()
In [ ]:
df_resampled.set_index('datetime', inplace=True)
df_resampled.sort_index(inplace=True)
df_resampled.index
Out[ ]:
DatetimeIndex(['2019-01-01 07:00:00', '2019-01-01 08:00:00',
               '2019-01-01 09:00:00', '2019-01-01 10:00:00',
               '2019-01-01 11:00:00', '2019-01-01 12:00:00',
               '2019-01-01 13:00:00', '2019-01-01 14:00:00',
               '2019-01-01 15:00:00', '2019-01-01 16:00:00',
               ...
               '2022-12-31 19:00:00', '2022-12-31 19:00:00',
               '2022-12-31 20:00:00', '2022-12-31 20:00:00',
               '2022-12-31 21:00:00', '2022-12-31 21:00:00',
               '2022-12-31 22:00:00', '2022-12-31 22:00:00',
               '2022-12-31 23:00:00', '2022-12-31 23:00:00'],
              dtype='datetime64[ns]', name='datetime', length=69666, freq=None)
In [ ]:
df_resampled.to_csv('/content/drive/MyDrive/Colab Notebooks/time_series.csv', index=True)

1.b¶

ضبط آلة الزمن

Check if there are any missing values in the resampled data

In [ ]:
missing_values = df_resampled.isnull().sum()
print("missing values:", missing_values)
missing values: VendorID         0
total_revenue    0
trip_count       0
dtype: int64

Verify that the time intervals between consecutive data points are regular

In [ ]:
time_diff = df_resampled.index[1] - df_resampled.index[0]
freq = str(time_diff)
print("Inferred Frequency:", freq)
Inferred Frequency: 0 days 01:00:00
In [ ]:
time_series = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/time_series.csv')
In [ ]:
df_long = pd.melt(time_series, id_vars=['datetime'], value_vars=['VendorID', 'total_revenue', 'trip_count'])

# Save the long format DataFrame to a CSV file
df_long.to_csv('/content/drive/MyDrive/Colab Notebooks/long_format_time_series.csv', index=False)

2. Features Engineering¶

add source_zone, destination_zone, source_borough, destination_borough features¶

In [ ]:
zone_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/taxi_zone_lookup.csv')
zone_data.columns
Out[ ]:
Index(['LocationID', 'Borough', 'Zone', 'service_zone'], dtype='object')
In [ ]:
def map_zones(partition, column, new_column1, new_column2, lookup=None):
    """Attach zone/borough names by joining ``column`` against the taxi-zone
    lookup table.

    ``new_column1`` / ``new_column2`` receive the Zone and Borough names.
    ``lookup`` defaults to the module-level ``zone_data`` frame, so existing
    callers are unchanged.

    NOTE(review): the inner join silently drops rows whose location id is
    absent from the lookup table, and leaves a LocationID join-key column
    behind (dropped in a later cell) — confirm both are intended.
    """
    if lookup is None:
        lookup = zone_data
    partition = partition.merge(lookup[['LocationID', 'Zone', 'Borough']], left_on=column, right_on='LocationID', how='inner')
    partition = partition.rename(columns={'Zone': new_column1, 'Borough': new_column2})
    return partition
In [ ]:
cleaned_data_with_features = cleaned_data.map_partitions(map_zones, column = 'PULocationID',
                                                         new_column1='source_zone',new_column2='source_borough')
cleaned_data_with_features = cleaned_data_with_features.map_partitions(map_zones, column = 'DOLocationID',
                                                                       new_column1='destination_zone',new_column2='destination_borough')
cleaned_data_with_features.columns
Out[ ]:
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'tip_amount', 'tolls_amount', 'total_amount', 'congestion_surcharge',
       'pickup_before_dropoff', 'pickup_hour', 'pickup_day', 'pickup_month',
       'pickup_year', 'LocationID_x', 'source_zone', 'source_borough',
       'LocationID_y', 'destination_zone', 'destination_borough'],
      dtype='object')
In [ ]:
# Join-key columns left behind by the two map_zones merges.
columns_to_drop = ['LocationID_y', 'LocationID_x']

def drop_columns(partition):
    """Remove the redundant LocationID join keys produced by map_zones."""
    return partition.drop(columns=columns_to_drop)

cleaned_data_with_features = cleaned_data_with_features.map_partitions(drop_columns)

location_pair¶

In [ ]:
def create_location_pair(partition):
    """Add a direction-independent 'location_pair' key: the source and
    destination zone names sorted alphabetically, comma-joined."""
    def pair_key(zones):
        return ','.join(sorted(zones))

    partition['location_pair'] = partition[['source_zone', 'destination_zone']].apply(pair_key, axis=1)
    return partition
In [ ]:
cleaned_data_with_features = cleaned_data_with_features.map_partitions(create_location_pair)

add payment_type_name, vendor, rate_code features¶

In [ ]:
# TLC payment_type codes -> human-readable labels.
payment_type_mapping = {
    1: 'Credit card',
    2: 'Cash',
    3: 'No charge',
    4: 'Dispute',
    5: 'Unknown',
    6: 'Voided trip'
}

def map_payment_type_name(partition):
    """Add 'payment_type_name': the readable label for each payment_type code."""
    labels = partition['payment_type'].map(payment_type_mapping)
    partition['payment_type_name'] = labels
    return partition
In [ ]:
cleaned_data_with_features = cleaned_data_with_features.map_partitions(map_payment_type_name)
In [ ]:
# TLC VendorID codes -> provider names.
vendor_mapping = {
    1: 'Creative Mobile Technologies, LLC',
    2: 'VeriFone Inc.'
}

def map_vendor(partition):
    """Add a 'vendor' column holding the provider name for each VendorID."""
    names = partition['VendorID'].map(vendor_mapping)
    partition['vendor'] = names
    return partition
In [ ]:
cleaned_data_with_features = cleaned_data_with_features.map_partitions(map_vendor)
In [ ]:
# TLC RatecodeID codes -> rate descriptions.
rate_code_mapping = {
    1: 'Standard rate',
    2: 'JFK',
    3: 'Newark',
    4: 'Nassau or Westchester',
    5: 'Negotiated fare',
    6: 'Group ride'
}

def map_rate_code(partition):
    """Add a 'rate_code' column holding the description for each RatecodeID."""
    descriptions = partition['RatecodeID'].map(rate_code_mapping)
    partition['rate_code'] = descriptions
    return partition
In [ ]:
cleaned_data_with_features = cleaned_data_with_features.map_partitions(map_rate_code)
In [4]:
output_cleaned_data_with_features= "/content/drive/MyDrive/Colab Notebooks/cleaned_data_with_features.parquet"
In [ ]:
with ProgressBar():
    cleaned_data_with_features.to_parquet(output_cleaned_data_with_features, engine='pyarrow')
cleaned_data_with_features = dd.read_parquet(output_cleaned_data_with_features, engine='pyarrow')
[########################################] | 100% Completed | 20m 41s

trip_class feature¶

In [ ]:
with ProgressBar():
    location_pair_counts = cleaned_data_with_features['location_pair'].value_counts(dropna=False).compute()
[########################################] | 100% Completed | 33.33 s
In [ ]:
location_pair_counts_df = location_pair_counts.reset_index().rename(columns={"index":'location_pair', "location_pair": 'count'})
In [ ]:
max_count = location_pair_counts_df['count'].max()
mean_count = location_pair_counts_df['count'][1:].mean()
min_count = location_pair_counts_df['count'].min()
quartiles = np.round(np.percentile([min_count, mean_count, max_count/2], [10, 53, 60, 100]))
thresholds = [
    min_count,
    int(quartiles[0]), #rare
    int(quartiles[1]), #less-common
    int(quartiles[2]), #common
    max_count #more-common
]
print(thresholds)
[1, 1172, 71088, 223296, 2186115]
In [ ]:
(location_pair_counts_df[((location_pair_counts_df['count'] > thresholds[0]) & (location_pair_counts_df['count'] < thresholds[1]))]['count'].sum() ,
 location_pair_counts_df[((location_pair_counts_df['count'] > thresholds[1]) & (location_pair_counts_df['count'] < thresholds[2]))]['count'].sum() ,
 location_pair_counts_df[((location_pair_counts_df['count'] > thresholds[2]) & (location_pair_counts_df['count'] < thresholds[3]))]['count'].sum(),
 location_pair_counts_df[((location_pair_counts_df['count'] > thresholds[3]) & (location_pair_counts_df['count'] < thresholds[4]))]['count'].sum()
 )
Out[ ]:
(2985659, 41283605, 58510933, 68556780)
In [ ]:
def assign_trip_class(partition):
    """Bucket each trip into rare / less-common / common / more-common
    according to how often its location_pair occurs (frequencies come from
    the module-level ``location_pair_counts``; bin edges from ``thresholds``)."""
    pair_frequency = partition['location_pair'].map(location_pair_counts)
    partition['trip_class'] = pd.cut(
        pair_frequency,
        bins=thresholds,
        labels=['rare', 'less-common', 'common', 'more-common'],
    )
    return partition

cleaned_data_with_features = cleaned_data_with_features.map_partitions(assign_trip_class)

trip_duration feature¶

In [ ]:
cleaned_data_with_features['trip_duration'] = (cleaned_data_with_features['tpep_dropoff_datetime'] - cleaned_data_with_features['tpep_pickup_datetime']) / timedelta(minutes=1)

trip_distance_km¶

In [ ]:
cleaned_data_with_features['trip_distance_km'] = cleaned_data_with_features['trip_distance'] * 1.609344
In [ ]:
cleaned_data_with_features.columns
In [ ]:
with ProgressBar():
    cleaned_data_with_features.to_parquet(output_cleaned_data_with_features, engine='pyarrow')
[########################################] | 100% Completed | 427.90 s

3. Exploration and analysis¶

In [4]:
featured_data = dd.read_parquet('nytchallenge/featured_data.parquet', engine='pyarrow')

3.a Beyond the edge of numbers¶

In [5]:
with ProgressBar():
    vendor_counts = featured_data['vendor'].value_counts().compute()
[########################################] | 100% Completed | 74.71 s
In [6]:
fig = px.pie(vendor_counts, values=vendor_counts.values, names=vendor_counts.index, title='Vendor Share')
fig.show()
نلاحظ أن حصة الوكالة VeriFone Inc من السوق أكبر من حصة الوكالة الأخرى بمقدار الضعف تقريباً.

بعد إجراء بعض عمليات البحث وجدنا أن وكالة verifone inc هي الأقدم في السوق وقد تأسست بتاريخ 1992 باسم مختلف من قبل سائق تكسي أساساً وقد أصبح اليوم اسمها curb وقد دخلت إلى تكنولوجيا تطبيقات حجز الرحلات لتنافس شركات uber و lyft وأخيراً تعاقدت مع شركة uber في شهر آذار عام 2022 مما قد يكون سبباً في سيطرتها على السوق مقارنةً بشركة creative mobile technology, llc والتي تأسست في عام 2005 والتي تمتلك العديد من الحلول التكنولوجية مثل تطبيقات الهواتف المحمولة ومواقع الويب وغيرها.

In [ ]:
pio.write_image(fig, 'Vendor Share.png', format='png')

ii¶

view the share of each Borough (source_borough, destination_borough)

In [7]:
with ProgressBar():
    source_borough_counts = featured_data['source_borough'].value_counts().compute()
    destination_borough_counts = featured_data['destination_borough'].value_counts().compute()
[########################################] | 100% Completed | 75.75 s
[########################################] | 100% Completed | 74.41 s
In [8]:
borough_counts = pd.concat([source_borough_counts, destination_borough_counts])
In [9]:
fig = px.pie(borough_counts, values=borough_counts.values, names=borough_counts.index, title='Borough Share')
fig.show()
نلاحظ أن بلدية Manhattan تسيطر على السوق.

تُعدّ منهاتن الأكثر شهرة بين جميع البلديات والأكثر ازدحاماً والأكثر زيارةً وقد يكون هذا السبب المحتمل لسيطرتها على السوق.

In [ ]:
pio.write_image(fig, 'Borough Share.png', format='png')

iii¶

In [10]:
with ProgressBar():
    payment_type_counts = featured_data['payment_type_name'].value_counts().compute()
[########################################] | 100% Completed | 73.68 s
In [11]:
fig = px.bar(payment_type_counts, x=payment_type_counts.index, y=payment_type_counts.values, title='Payment Methods')
fig.update_layout(
    yaxis=dict(
        title='Count'
    )
)
fig.show()
معظم طرق الدفع لسيارات الأجرة في نيويورك تتم عن طريق ال credit card.

وذلك يعود إلى أن كلتا الوكالتين هي من الوكالات التكنولوجية الحديثة كما أنه وبعد إجراء القليل من البحث تبيّن لنا أنّ أغلب سائقي سيارات الأجرة في مدينة نيويورك يعتمدون الـ credit cards كوسيلة دفع.

In [ ]:
pio.write_image(fig, 'Payment Methods.png', format='png')

iV¶

In [12]:
with ProgressBar():
    payment_type_trip_class_counts = featured_data.groupby(['payment_type_name', 'trip_class']).size().reset_index().rename(columns={0: 'count'}).compute()
[########################################] | 100% Completed | 383.37 s
In [13]:
fig = px.sunburst(payment_type_trip_class_counts, path=['trip_class', 'payment_type_name'], values='count')
fig.show()
In [ ]:
pio.write_image(fig, 'payment type of trip class.png', format='png')

V¶

In [14]:
with ProgressBar():
    correlation_matrix = featured_data[['total_amount', 'tip_amount', 'fare_amount', 'passenger_count', 'trip_duration', 'trip_distance']].corr().compute()
correlation_matrix
[########################################] | 100% Completed | 161.47 s
Out[14]:
total_amount tip_amount fare_amount passenger_count trip_duration trip_distance
total_amount 1.000000 0.666808 0.976456 0.014598 0.148588 0.845064
tip_amount 0.666808 1.000000 0.521965 0.001175 0.080030 0.528000
fare_amount 0.976456 0.521965 1.000000 0.015325 0.150626 0.827287
passenger_count 0.014598 0.001175 0.015325 1.000000 0.015952 0.017038
trip_duration 0.148588 0.080030 0.150626 0.015952 1.000000 0.153610
trip_distance 0.845064 0.528000 0.827287 0.017038 0.153610 1.000000
In [15]:
heatmap_trace = go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='Viridis'
)
layout = go.Layout(
    title='Correlation Matrix Heatmap',
)
fig = go.Figure(data=[heatmap_trace], layout=layout)
fig.show()

نلاحظ أن المسافة مرتبطة جداً بالأجرة الكلية والأجرة الفعلية، وبالطبع الأجرة الفعلية مرتبطة جدا بالأجرة الكلية. والعلاقة بين عدد الركاب وباقي السمات ضعيف (لأنه ليس لديهم تكسي ركاب 😬). العلاقة بين المسافة ومدة الرحلة ليست مرتبطة جداً لأن هناك عوامل خارجية قد تؤثر على مدة الرحلة (السرعة، الازدحام، المسافات الطويلة غالباً تكون رحلات سفر وحدود السرعة فيها عالية لخلوها من الازدحام الشديد، ...)

In [ ]:
pio.write_image(fig, 'Correlation.png', format='png')

vi¶

In [ ]:
subset_data = featured_data[['trip_class', 'total_amount', 'trip_duration', 'trip_distance']]
In [ ]:
with ProgressBar():
    stats_per_class = subset_data.groupby('trip_class').agg({
    'total_amount': ['mean', 'median', 'std'],
    'trip_duration': ['mean', 'median', 'std'],
    'trip_distance': ['mean', 'median', 'std']
}, shuffle='tasks').compute()
In [ ]:
stats_per_class = stats_per_class.stack().reset_index()
In [ ]:
stats_per_class2 = stats_per_class.melt(
    id_vars=['trip_class', 'level_1'],
    value_vars=['total_amount', 'trip_duration', 'trip_distance'],
    var_name='metric',
    value_name='value'
)

title = 'Mean, Median, and Standard Deviation for Total Cost, Trip Duration, and Distance Traveled per Trip Class'

fig = px.bar(
    stats_per_class2,
    x='metric',
    y='value',
    color='value',
    facet_row='trip_class',
    facet_col='level_1',
    title=title,
    barmode='group',
)

fig.update_yaxes(matches='y')
fig.update_layout(height=800)
fig.show()
In [ ]:
pio.write_image(fig, 'statistics DataFrame.png', format='png')

vii¶

In [16]:
time_series_df = pd.read_csv('nytchallenge/time_series.csv', parse_dates=["datetime"], index_col="datetime")
time_series_df.columns
Out[16]:
Index(['VendorID', 'total_revenue', 'trip_count'], dtype='object')
In [17]:
# Split the hourly series by vendor, keeping only the two target metrics.
# .loc with a column list selects the same values as the original chained
# [mask][[cols]] form, without the intermediate copy.
metric_cols = ['total_revenue', 'trip_count']
vendor1_data = time_series_df.loc[time_series_df['VendorID'] == 1, metric_cols]
vendor2_data = time_series_df.loc[time_series_df['VendorID'] == 2, metric_cols]

Autocorrelation, Partial autocorrelation¶

*Autocorrelation*

is just the correlation of the data with itself. Autocorrelation measures a set of current values against a set of past values to see if they correlate. It is heavily used in time series analysis and forecasting

In [18]:
# Autocorrelation of vendor-1 hourly revenue over two days of lags.
# acf() with nlags=48 returns 49 values (lags 0..48 inclusive), so the x-axis
# must match that length — the original `np.arange(24*2)` was one element
# short of the y-values.
acf_values = acf(vendor1_data['total_revenue'], nlags=48)
lags = np.arange(len(acf_values))  # 0..48 inclusive
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Scatter(x=lags, y=acf_values, mode='lines', name='ACF'), row=1, col=1)
fig.update_layout(height=600, width=800, title_text='Vendor 1 Autocorrelation Function - Total Revenue')
fig.update_xaxes(title_text='Lag')
fig.update_yaxes(title_text='Autocorrelation')
fig.show()
In [19]:
# ACF / PACF for both vendors' hourly revenue over 48 lags (two days).
# NOTE(review): alpha=0.5 draws 50% confidence bands — an unusually loose
# significance level; confirm 0.05 was not intended.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
plot_acf(vendor1_data['total_revenue'], lags=48, ax=axes[0, 0], alpha=0.5)
axes[0, 0].set_title('Autocorrelation for Vendor 1 - Total Revenue')

plot_acf(vendor2_data['total_revenue'], lags=48, ax=axes[0, 1], alpha=0.5)
axes[0, 1].set_title('Autocorrelation for Vendor 2 - Total Revenue')

plot_pacf(vendor1_data['total_revenue'], lags=48, ax=axes[1, 0], alpha=0.5)
axes[1, 0].set_title('Partial Autocorrelation for Vendor 1 - Total Revenue')

plot_pacf(vendor2_data['total_revenue'], lags=48, ax=axes[1, 1], alpha=0.5)
axes[1, 1].set_title('Partial Autocorrelation for Vendor 2 - Total Revenue')

plt.tight_layout()
plt.show()

Rolling Window¶

A window size equal to the seasonal duration (e.g., 12 for a monthly series) will effectively nullify the seasonal effect.

In [20]:
# Centred rolling mean over 24*30 hourly samples (~one month) for both
# vendors; min_periods=1 keeps values at the series edges instead of NaN.
vendor1_smooth = vendor1_data.rolling(window = 24*30, center=True, min_periods=1).mean() # a rolling window of 24*30 hours (~1 month)
vendor2_smooth = vendor2_data.rolling(window = 24*30, center=True, min_periods=1).mean() # a rolling window of 24*30 hours (~1 month) — the original comment said "24 hour (1 day)", which did not match the code

# Repackage the smoothed series with an explicit Date column for plotting.
vendor1_smooth_df = pd.DataFrame({'Date': vendor1_data.index, 'Smoothed Total Revenue': vendor1_smooth['total_revenue'], 'Smoothed Trip Count':vendor1_smooth['trip_count']})
vendor2_smooth_df = pd.DataFrame({'Date': vendor2_data.index, 'Smoothed Total Revenue': vendor2_smooth['total_revenue'], 'Smoothed Trip Count':vendor2_smooth['trip_count']})

# 2x2 grid: rows = vendor, columns = metric.
fig = make_subplots(rows=2, cols=2, subplot_titles=[
    'Smoothed Time Series for Vendor 1 - Total Revenue',
    'Smoothed Time Series for Vendor 1 - Trip Count',
    'Smoothed Time Series for Vendor 2 - Total Revenue',
    'Smoothed Time Series for Vendor 2 - Trip Count'
])

fig.add_trace(go.Scatter(x=vendor1_smooth_df['Date'], y=vendor1_smooth_df['Smoothed Total Revenue']), row=1, col=1)
fig.add_trace(go.Scatter(x=vendor1_smooth_df['Date'], y=vendor1_smooth_df['Smoothed Trip Count']), row=1, col=2)
fig.add_trace(go.Scatter(x=vendor2_smooth_df['Date'], y=vendor2_smooth_df['Smoothed Total Revenue']), row=2, col=1)
fig.add_trace(go.Scatter(x=vendor2_smooth_df['Date'], y=vendor2_smooth_df['Smoothed Trip Count']), row=2, col=2)

fig.update_layout(height=600, width=1200, title_text="Smoothed Time Series Subplots")

fig.show()
In [21]:
# Overlay the raw hourly revenue with its rolling mean, and mark every
# month boundary with a dashed vertical line.
fig = go.Figure()

fig.add_trace(go.Scatter(x=vendor1_data.index, y=vendor1_data["total_revenue"], mode='lines', name='Original', line=dict(color='#379BDB')))
fig.add_trace(go.Scatter(x=vendor1_data.index, y=vendor1_smooth_df["Smoothed Total Revenue"], mode='lines', name='Rolling Mean', line=dict(color='#D22A0D')))

# Update layout
fig.update_layout(
    title="Rolling Mean & Original",
    xaxis_title="Date",
    yaxis_title="total revenue",
    showlegend=True,
    legend=dict(x=0.02, y=0.98, bgcolor='rgba(255, 255, 255, 0.5)'),
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
    height=500,
    width=900
)

# Month-start guide lines, Jan 2019 .. Dec 2022, generated instead of
# hand-written.  The original literal list was missing a comma after
# '2021-12-01', which silently concatenated it with '2022-01-01' into one
# invalid date string ('2021-12-012022-01-01').
xcoords = pd.date_range('2019-01-01', '2022-12-01', freq='MS').strftime('%Y-%m-%d').tolist()
for xc in xcoords:
    fig.add_shape(type='line', x0=xc, y0=vendor1_smooth_df['Smoothed Total Revenue'].min(),
                  x1=xc, y1=vendor1_smooth_df['Smoothed Total Revenue'].max(),
                  line=dict(color='black', dash='dash'))
fig.show()

Extra¶

In [22]:
fig = make_subplots(rows=2, cols=1, subplot_titles=[
    'Vendor 1 - Total Revenue and Trip Count',
    'Vendor 2 - Total Revenue and Trip Count'
])

# One revenue trace and one trip-count trace per vendor, each vendor on its
# own row; trace order matches the original (revenue first, then count).
for row, frame in ((1, vendor1_data), (2, vendor2_data)):
    fig.add_trace(
        go.Scatter(x=frame.index, y=frame['total_revenue'],
                   name=f'Vendor {row} - Total Revenue'),
        row=row, col=1)
    fig.add_trace(
        go.Scatter(x=frame.index, y=frame['trip_count'],
                   name=f'Vendor {row} - Trip Count'),
        row=row, col=1)

fig.update_layout(height=600, width=900, title_text="Vendor Data Subplots", showlegend=True)
fig.show()

قمنا هنا برسم السلسلة الزمنية على مدار شهري، ولاحظنا أنّ هناك انخفاض ملحوظ بعدد الرحلات والإيرادات يتبعه ارتفاع بنسبة قليلة من أجل كلتا الوكالتين ما بين شهري نيسان 4 من عام 2020 وتشرين الأول 9 من عام 2020، وبعد إجراء بعض الأبحاث وجدنا أن فترة فيروس كورونا COVID-19 قد أثرت تأثيراً سلبيّاً كبيراً على وكالات وسائقي سيارات الأجرة بشكل عام حتى أنهم في ايرلاند خرجوا باحتجاجات، وبحسب الإحصائيات فإن أكثر من نصف سائقي سيارات الأجرة لم يعودوا إلى عملهم بعد الجائحة وهذا يفسر أنه حتى بعدما عادت الإيرادات وعدد الرحلات لترتفع من جديد إلا أنها لم تعد كما كانت عليه قبل فترة الجائحة.

In [ ]:
pio.write_image(fig, 'Vendor Data Subplots.png', format='png')
In [31]:
!jupyter nbconvert --to HTML '/content/drive/MyDrive/Colab Notebooks/DmH2.ipynb'
[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/DmH2.ipynb to HTML
/usr/local/lib/python3.10/dist-packages/nbconvert/filters/widgetsdatatypefilter.py:71: UserWarning: Your element with mimetype(s) dict_keys(['application/vnd.plotly.v1+json']) is not able to be represented.
  warn(
[NbConvertApp] Writing 5005210 bytes to /content/drive/MyDrive/Colab Notebooks/DmH2.html

3.b Catching Patterns¶

Vendor 1¶

Stationarity¶

Stationarity is an important characteristic of time series. A time series is said to be stationary if its statistical properties do not change over time. In other words, it has constant mean and variance, and its covariance is independent of time. Ideally, we want a stationary time series for modelling. If a series is not stationary, we can apply different transformations to make it stationary.

adfuller

It tests the null hypothesis that a unit root is present. If the p-value is large (e.g., p > 0.05), the null hypothesis cannot be rejected and the process is considered non-stationary. If the p-value is small, the null hypothesis is rejected and the process is considered stationary.

In [23]:
# Augmented Dickey–Fuller test.  H0: a unit root is present (non-stationary).
# p <= 0.05 rejects H0, so the series is treated as stationary.
adf, pvalue, _, _, critical_values, _ = stattools.adfuller(
    vendor1_data['total_revenue'], autolag='AIC'
)
print(pvalue)
verdict = 'Non-Stationary' if pvalue > 0.05 else 'Stationary'
print('Probably ' + verdict)
2.8418965620348085e-07
Probably Stationary

Modelling time series¶

In [24]:
# Chronological split: first 80% of the hourly series for training, the
# remaining 20% for testing (no shuffling for time-series data).
train_size = int(0.8 * len(vendor1_data))
train_data, test_data = vendor1_data.iloc[:train_size], vendor1_data.iloc[train_size:]
seasonal_decompose¶
In [25]:
# Model the trend, seasonality, and residuals using seasonal_decompose
# (additive model; period=24 = one day of hourly observations).
decomposition = seasonal_decompose(train_data['total_revenue'], model='additive', period=24)
trend = decomposition.trend        # smoothed long-run component (NaN at the edges)
seasonal = decomposition.seasonal  # repeating daily pattern
residuals = decomposition.resid    # remainder after removing trend + seasonality
In [26]:
# One row per decomposition component, sharing the time axis.
fig = make_subplots(rows=3, cols=1, shared_xaxes=True, subplot_titles=['Trend', 'Seasonality', 'Residuals'])

components = [('Trend', trend), ('Seasonality', seasonal), ('Residuals', residuals)]
for row, (label, series) in enumerate(components, start=1):
    fig.add_trace(go.Scatter(x=train_data.index, y=series, mode='lines', name=label), row=row, col=1)

fig.update_layout(height=800, width=1200, title_text="Seasonal Decomposition of Vendor 1 - Total Revenue")

fig.show()
Fit different statistical models and compare their AIC¶
In [28]:
from itertools import product

# Candidate (p, d, q) orders: p in 3..6, d fixed at 1, q in 0..3.
ps = range(3, 7)
ds = range(1, 2)
qs = range(0, 4)

order_list = list(product(ps, ds, qs))
In [29]:
# `itertools.product` already yields tuples, so this normalization is a
# no-op; rewritten as a single comprehension instead of index-wise mutation
# of the list in place.
order_list = [(p, d, q) for (p, d, q) in order_list]
In [35]:
from typing import Union
from tqdm.notebook import tqdm

def optimize_SARIMAX(endog: Union[pd.Series, list], order_list: list, test_data: pd.DataFrame):
    """Grid-search SARIMAX (p, d, q) orders on `endog`.

    Fits one SARIMAX model per order, records its AIC and the MSE of its
    out-of-sample predictions, and keeps the predictions of the lowest-MSE
    model.  Orders whose fit or prediction raises are skipped.

    Parameters
    ----------
    endog : training series (here: hourly total revenue).
    order_list : list of (p, d, q) tuples to try.
    test_data : DataFrame with a 'total_revenue' column and a datetime index
        spanning the prediction horizon.

    Returns
    -------
    (result_df, best_predictions) : result_df has columns
        ['(p, d, q)', 'AIC', 'mse']; best_predictions holds the predictions
        of the lowest-MSE model, or None if every fit failed.
    """
    results = []
    lowest_mse = float('inf')
    best_predictions = None

    for order in tqdm(order_list):
        try:
            sarimax_model = SARIMAX(endog, order=order, simple_differencing=False).fit(disp=False)
            test_predictions = sarimax_model.predict(start=test_data.index[0], end=test_data.index[-1])
        except Exception:
            # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
            # only genuine fit/prediction failures should be skipped.
            continue

        aic = sarimax_model.aic
        mse = mean_squared_error(test_data['total_revenue'], test_predictions)
        if mse < lowest_mse:
            lowest_mse = mse
            best_predictions = test_predictions

        results.append([order, aic, mse])

    result_df = pd.DataFrame(results)
    result_df.columns = ['(p, d, q)', 'AIC', 'mse']

    return result_df, best_predictions
In [31]:
result_df3, best_predictions3 = optimize_SARIMAX(train_data['total_revenue'], order_list, test_data)
result_df3
  0%|          | 0/16 [00:00<?, ?it/s]
D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

Out[31]:
(p, d, q) AIC mse
0 (3, 1, 0) 563122.708034 3.056047e+08
1 (3, 1, 1) 563121.817440 3.059815e+08
2 (3, 1, 3) 556066.491065 2.389156e+08
3 (4, 1, 0) 563089.315956 3.099661e+08
4 (4, 1, 2) 561383.434699 3.265998e+08
5 (5, 1, 1) 557991.144863 2.404729e+08
6 (5, 1, 2) 557957.945055 2.404858e+08
7 (6, 1, 0) 561817.851815 2.955761e+08
In [ ]:
# Row of the grid-search results with the lowest test MSE — the best
# (p, d, q) order found.  Bare last expression displays it.
min_row_column2 = result_df3.loc[result_df3['mse'].idxmin()]
min_row_column2
Out[ ]:
(p, d, q)           (6, 1, 3)
AIC             556007.501217
mse          238151857.578634
Name: 21, dtype: object
In [37]:
# Refit the best order found by the grid search ((6, 1, 3)) and predict
# over the hold-out span.
sarimax_model = SARIMAX(train_data['total_revenue'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
best_predictions3 = sarimax_model.predict(start=test_data.index[0], end=test_data.index[-1])
D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

In [38]:
# Convert the prediction Series to a timestamp-indexed DataFrame:
# reset_index names the old index 'index'; set_index restores it, leaving a
# single 'predicted_mean' column.
best_predictions_df3 = best_predictions3.reset_index()
best_predictions_df3.set_index('index', inplace= True)
In [45]:
# Plot the 80/20 split: training series, actual hold-out values, and the
# model forecast (trace labelled 'ARIMA').
fig = go.Figure()

fig.add_trace(go.Scatter(x=train_data.index, y=train_data['total_revenue'], name='Total Revenue'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['total_revenue'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=best_predictions_df3['predicted_mean'], mode='lines', name='ARIMA', line=dict(dash='dash')))

fig.update_layout(
    xaxis=dict(title='Time'),
    yaxis=dict(title='Total Revenue'),
    legend=dict(x=0, y=1),
)

# Tick labels show the day of month only.
fig.update_xaxes(tickformat="%d")

fig.show()
In [40]:
# First difference to stabilise the mean (make the process stationary);
# .diff() is equivalent to `x - x.shift(1)`.  Drop the NaN/inf rows the
# differencing introduces.
vendor1_data_diff = vendor1_data.diff()
vendor1_data_diff = vendor1_data_diff.replace([np.inf, -np.inf], np.nan).dropna()
In [41]:
train_size1 = int(len(vendor1_data) * 0.9)  # 90% for training, 10% for testing (the original comment said 80/20, which did not match the code)
train_data1 = vendor1_data.iloc[:train_size1]
test_data1 = vendor1_data.iloc[train_size1:]
In [42]:
# Fit the selected (6, 1, 3) order on the 90/10 split and forecast the hold-out.
# result_df1, best_predictions1 = optimize_SARIMAX(train_data['total_revenue'], [(6, 1, 3)], test_data)
sarimax_model1 = SARIMAX(train_data1['total_revenue'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
best_predictions1 = sarimax_model1.predict(start=test_data1.index[0], end=test_data1.index[-1])
D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

In [43]:
best_predictions_df1 = best_predictions1.reset_index()
In [44]:
# Plot the 90/10 split: training series, actual hold-out, and the SARIMAX
# forecast (dashed).
fig = go.Figure()

fig.add_trace(go.Scatter(x=train_data1.index, y=train_data1['total_revenue'], name='Total Revenue'))
fig.add_trace(go.Scatter(x=test_data1.index, y=test_data1['total_revenue'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data1.index, y=best_predictions_df1['predicted_mean'], mode='lines', name='SARIMAX', line=dict(dash='dash')))

fig.update_layout(
    xaxis=dict(title='Time'),
    yaxis=dict(title='Total Revenue'),
    legend=dict(x=0, y=1),

)

# Tick labels show the month number only.
fig.update_xaxes(tickformat="%m")

fig.show()
In [46]:
# Refit on the 80/20 training data for the component decomposition below.
# NOTE: the variable is named `arima_model` but this is a SARIMAX fit with
# no seasonal order, which reduces to a plain ARIMA(6, 1, 3).
arima_model = SARIMAX(train_data['total_revenue'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

Components of SARIMAX¶
In [47]:
# Decompose the in-sample fitted values into trend/seasonal/residual parts
# (additive model, period=24 hourly observations).
decomposition = seasonal_decompose(arima_model.fittedvalues, model='additive', period=24)
# The original created a throwaway `go.Figure()` that was immediately
# overwritten by `make_subplots`; the dead assignment is removed.
fig = make_subplots(rows=4, cols=1, shared_xaxes=False, subplot_titles=['Fitted', 'Trend', 'Seasonal', 'Residual'])
fig.add_trace(go.Scatter(x=train_data.index, y=arima_model.fittedvalues, name='Fitted'),  row=1, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.trend, name='Trend'),row=2, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.seasonal, name='Seasonal'),row=3, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.resid, name='Residual'), row=4, col=1)

fig.update_layout(
    height=800, width=1200,
    title='Components of SARIMAX',
    xaxis_title='Time',
    yaxis_title='Value'
)

fig.show()

Prophet model¶

In [48]:
# Prophet model — expects columns 'ds' (timestamp) and 'y' (target),
# hence the rename of the reset index and revenue column.
prophet_model = Prophet()
prophet_data = train_data.reset_index().rename(columns={'datetime': 'ds', 'total_revenue': 'y'})
prophet_model.fit(prophet_data)
23:50:43 - cmdstanpy - INFO - Chain [1] start processing
23:51:25 - cmdstanpy - INFO - Chain [1] done processing
Out[48]:
<prophet.forecaster.Prophet at 0x21d4b4bf4f0>
In [49]:
# Predict over the test period and score the point forecasts with MSE.
prophet_predictions = prophet_model.predict(test_data.reset_index().rename(columns={'datetime': 'ds', 'total_revenue': 'y'}))
prophet_predictions = prophet_predictions['yhat'].values  # keep point forecasts only
prophet_mse = mean_squared_error(test_data['total_revenue'], prophet_predictions)

print("Prophet Model Performance:")
print(f"MSE: {prophet_mse}")
Prophet Model Performance:
MSE: 243614190.98605546
In [53]:
# Prophet's built-in component plot (trend and fitted seasonalities)
# over the test period.
fig = prophet_model.plot_components(prophet_model.predict(test_data.reset_index().rename(columns={'datetime': 'ds', 'total_revenue': 'y'})))
In [54]:
# Plot training series, actual hold-out, and Prophet's forecast together.
fig = go.Figure()

fig.add_trace(go.Scatter(x=train_data.index, y=train_data['total_revenue'], name='Total Revenue'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['total_revenue'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=prophet_predictions, mode='lines', name='Prophet'))

fig.update_layout(
    xaxis=dict(title='Time'),
    yaxis=dict(title='Total Revenue'),
    legend=dict(x=0, y=1),
)

# Tick labels show the day of month only.
fig.update_xaxes(tickformat="%d")

fig.show()

predict future¶

In [55]:
# Forecast 30 periods beyond the end of the training data and plot the
# full fitted + forecast curve.
# NOTE(review): make_future_dataframe defaults to freq='D' (daily), while
# this series appears to be hourly — confirm whether freq='H' was intended.
prophet_future = prophet_model.make_future_dataframe(periods=30)
prophet_forecast = prophet_model.predict(prophet_future)

fig = go.Figure()

fig.add_trace(go.Scatter(x=prophet_forecast.ds, y=prophet_forecast['yhat'].values, mode='lines', name='Prophet'))

fig.update_layout(
    xaxis=dict(title='Time'),
    yaxis=dict(title='Total Revenue'),
    legend=dict(x=0, y=1),
)

# Tick labels show the two-digit year.
fig.update_xaxes(tickformat="%y")

fig.show()

Trip_count¶

In [56]:
# Fit the same (6, 1, 3) order to vendor 1 trip counts and predict the
# hold-out span.
sarimax_model3 = SARIMAX(train_data['trip_count'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
best_predictions = sarimax_model3.predict(start=test_data.index[0], end=test_data.index[-1])
D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

In [57]:
# Convert the prediction Series to a timestamp-indexed DataFrame
# (same reset_index/set_index round-trip as for the revenue model).
best_predictions_df = best_predictions.reset_index()
best_predictions_df.set_index('index', inplace= True)
In [58]:
# `plotly.graph_objects` is already imported as `go` at the top of the
# notebook, so the duplicate mid-notebook import is dropped.
# The original labelled the trip-count trace and y-axis "Total Revenue" —
# a copy-paste slip from the revenue cell; corrected to "Trip Count".
fig = go.Figure()

fig.add_trace(go.Scatter(x=train_data.index, y=train_data['trip_count'], name='Trip Count'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['trip_count'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=best_predictions_df['predicted_mean'], mode='lines', name='ARIMA', line=dict(dash='dash')))

fig.update_layout(
    xaxis=dict(title='Time'),
    yaxis=dict(title='Trip Count'),
    legend=dict(x=0, y=1),

)

# Tick labels show the day of month only.
fig.update_xaxes(tickformat="%d")

fig.show()
In [59]:
# Refit on trip_count for the component decomposition below.
arima_model = SARIMAX(train_data['trip_count'], order=(6, 1, 3), simple_differencing=False).fit(disp=False)
D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

In [60]:
# Decompose the in-sample fitted trip-count values into components
# (additive model, period=24 hourly observations).
decomposition = seasonal_decompose(arima_model.fittedvalues, model='additive', period=24)

# The original created a throwaway `go.Figure()` that was immediately
# overwritten by `make_subplots`; the dead assignment is removed.
fig = make_subplots(rows=4, cols=1, shared_xaxes=False, subplot_titles=['Fitted', 'Trend', 'Seasonal', 'Residual'])
fig.add_trace(go.Scatter(x=train_data.index, y=arima_model.fittedvalues, name='Fitted'),  row=1, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.trend, name='Trend'),row=2, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.seasonal, name='Seasonal'),row=3, col=1)
fig.add_trace(go.Scatter(x=train_data.index, y=decomposition.resid, name='Residual'), row=4, col=1)

fig.update_layout(
    height=800, width=1200,
    title='Components of SARIMAX',
    xaxis_title='Time',
    yaxis_title='Value'
)

fig.show()

Vendor 2¶

In [61]:
# Augmented Dickey-Fuller stationarity test on Vendor 2's total revenue.
# Fix: removed the duplicated in-cell `from statsmodels.tsa import stattools`;
# `stattools` is already imported in the notebook's top import cell.
adf, pvalue, _, _, critical_values, _ = stattools.adfuller(vendor2_data['total_revenue'],
                                                           autolag='AIC')
# p-value <= 0.05 rejects the unit-root null, i.e. the series is likely stationary.
print('Probably ' + ('Non-Stationary' if pvalue > 0.05 else 'Stationary'))
print(pvalue)
Probably Stationary
5.6842433239422795e-06
In [62]:
# Chronological 80/20 split — time series, so no shuffling.
train_size = int(len(vendor2_data) * 0.8)
train_data, test_data = vendor2_data.iloc[:train_size], vendor2_data.iloc[train_size:]
In [64]:
# Fit ARIMA(6,1,3) on Vendor 2's training revenue and forecast the test window.
sarimax_model4 = SARIMAX(
    train_data['total_revenue'],
    order=(6, 1, 3),
    simple_differencing=False,
).fit(disp=False)
forecast_start, forecast_end = test_data.index[0], test_data.index[-1]
best_predictions = sarimax_model4.predict(start=forecast_start, end=forecast_end)
D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

In [65]:
# Convert the prediction Series into a one-column DataFrame keyed by timestamp.
# Fix: the original `reset_index()` + `set_index('index', inplace=True)`
# round-trip (with the inplace anti-pattern) is equivalent to `to_frame()`
# plus naming the index 'index'; downstream cells read the 'predicted_mean'
# column, which to_frame() preserves from the Series name.
best_predictions_df = best_predictions.to_frame().rename_axis('index')
In [66]:
# Overlay training history, actual test values, and the ARIMA forecast.
traces = [
    go.Scatter(x=train_data.index, y=train_data['total_revenue'], name='Total Revenue'),
    go.Scatter(x=test_data.index, y=test_data['total_revenue'], mode='lines', name='Actual'),
    go.Scatter(x=test_data.index, y=best_predictions_df['predicted_mean'],
               mode='lines', name='ARIMA', line=dict(dash='dash')),
]
fig = go.Figure(data=traces)

fig.update_layout(
    xaxis=dict(title='Time'),
    yaxis=dict(title='Total Revenue'),
    legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%d")

fig.show()
In [67]:
# Same ARIMA(6,1,3) specification, now fit on the trip-count series.
sarimax_model4 = SARIMAX(
    train_data['trip_count'],
    order=(6, 1, 3),
    simple_differencing=False,
).fit(disp=False)
best_predictions = sarimax_model4.predict(
    start=test_data.index[0], end=test_data.index[-1]
)
D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning:

No frequency information was provided, so inferred frequency H will be used.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\tsa\statespace\sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

D:\5th year\semester 2\dataMining\lib\site-packages\statsmodels\base\model.py:607: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

In [68]:
# Convert the prediction Series into a one-column DataFrame keyed by timestamp.
# Fix: replaces the `reset_index()` + `set_index('index', inplace=True)`
# round-trip (inplace anti-pattern) with the equivalent `to_frame()` plus
# index naming; the 'predicted_mean' column used downstream is preserved
# from the Series name.
best_predictions_df = best_predictions.to_frame().rename_axis('index')
In [ ]:
# Overlay trip-count training history, actual test values, and the ARIMA forecast.
fig = go.Figure()

# BUG FIX: the first trace plotted `trip_count` but was labeled
# 'Total Revenue' (copy-paste from the revenue cell); label corrected.
fig.add_trace(go.Scatter(x=train_data.index, y=train_data['trip_count'], name='Trip Count'))
fig.add_trace(go.Scatter(x=test_data.index, y=test_data['trip_count'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=test_data.index, y=best_predictions_df['predicted_mean'], mode='lines', name='ARIMA', line=dict(dash='dash')))

fig.update_layout(
    xaxis=dict(title='Time'),
    # Axis title normalized to match the other figures' capitalization.
    yaxis=dict(title='Trip Count'),
    legend=dict(x=0, y=1),
)

fig.update_xaxes(tickformat="%d")

fig.show()
In [70]:
# Fit Prophet on the training slice; Prophet requires a frame with
# columns 'ds' (timestamp) and 'y' (target).
prophet_model = Prophet()
prophet_data = (
    train_data
    .reset_index()
    .rename(columns={'datetime': 'ds', 'trip_count': 'y'})
)
prophet_model.fit(prophet_data)
00:01:09 - cmdstanpy - INFO - Chain [1] start processing
00:01:40 - cmdstanpy - INFO - Chain [1] done processing
Out[70]:
<prophet.forecaster.Prophet at 0x21d77072e90>
In [71]:
# Forecast over the test window and score the point forecasts with MSE.
future_frame = test_data.reset_index().rename(columns={'datetime': 'ds', 'trip_count': 'y'})
prophet_predictions = prophet_model.predict(future_frame)
prophet_predictions2 = prophet_predictions['yhat'].values

prophet_mse = mean_squared_error(test_data['trip_count'], prophet_predictions2)

print("Prophet Model Performance:")
print(f"MSE: {prophet_mse}")
Prophet Model Performance:
MSE: 775593.7134831949
In [72]:
# Overlay trip-count history, actual test values, and the Prophet forecast.
traces = [
    go.Scatter(x=train_data.index, y=train_data['trip_count'], name='Trip Count'),
    go.Scatter(x=test_data.index, y=test_data['trip_count'], mode='lines', name='Actual'),
    go.Scatter(x=test_data.index, y=prophet_predictions2, mode='lines', name='Prophet'),
]
fig = go.Figure(data=traces)

fig.update_layout(
    xaxis=dict(title='Time'),
    yaxis=dict(title='Trip Count'),
    legend=dict(x=0, y=1),
)
fig.update_xaxes(tickformat="%m")

fig.show()
In [73]:
# Render Prophet's built-in component plots (trend and seasonalities)
# for the forecast frame produced above.
fig = prophet_model.plot_components(prophet_predictions)